# Load the required packages. library() stops with an error as soon as a
# package is missing, whereas require() only returns FALSE with a warning and
# lets the script fail much later at the first felm() / estfun() call.
library(lfe)       # felm(): regressions with absorbed fixed effects and clustered SEs
library(sandwich)  # estfun(): observation-level score contributions

# Read the three replication datasets from the current working directory.
# Dataset A is tabulated per election as national candidates, Dataset B per
# prefecture-year as local candidates, and Dataset C is the estimation sample
# (presumably one row per national candidate x prefecture x election -- confirm
# against the codebook).
dataset.A <- read.csv("datasetA.csv")
dataset.B <- read.csv("datasetB.csv")
dataset.C <- read.csv("datasetC.csv")

# function to calculate summary statistics
#   x:      numeric vector (or table of counts); missing values are ignored
#   digits: number of decimal places passed to round()
# returns c(mean, median, SD, min, max) in this order, rounded to `digits`
# NOTE(review): the name coincides with base R's S3 method for summary() on
# "table" objects, so summary(<table>) is likely to dispatch here while this
# definition is in scope -- confirm this is harmless for the script.
summary.table <- function(x, digits = 2) {
  statistics <- c(
    mean(x, na.rm = TRUE), 
    median(x, na.rm = TRUE), 
    sd(x, na.rm = TRUE), 
    min(x, na.rm = TRUE), 
    max(x, na.rm = TRUE)
  )
  round(statistics, digits)
}

# function to add asterisks for significance
#   p.value: numeric vector of p-values
# returns "**" where p < 0.01, "*" where 0.01 <= p < 0.05, and "" otherwise
# (element-wise; attributes such as names are carried over by ifelse())
significance <- function(p.value) {
  # stars for the 5% level, used wherever the 1% level is not reached
  five.percent.star <- ifelse(p.value < 0.05, "*", "")
  ifelse(p.value < 0.01, "**", five.percent.star)
}

#### Main Text --- DESIGN ####
## comparison of the number of candidates between national and local districts
# number of candidates: count the rows of Dataset A per election and the rows of
# Dataset B per prefecture-year (presumably one row per candidate -- confirm
# against the codebook)
number.of.national.candidates <- table(dataset.A$election)
number.of.local.candidates <- table(dataset.B$prefecture.year.id)

# percentage of local districts where the number of local candidates was in single digits
# (share of prefecture-years with fewer than 10 rows, rounded to an integer)
round(sum(number.of.local.candidates < 10) * 100 / length(number.of.local.candidates))

# average number of local candidates per prefecture-year (one decimal place)
round(mean(number.of.local.candidates), 1)

# average number of national candidates per election (one decimal place)
round(mean(number.of.national.candidates), 1)

#### Main Text --- RESULTS ####
# Each expression below is auto-printed; the values are the counts of distinct
# identifiers in Dataset C.

## number of observations
nrow(dataset.C)

## number of dummy variables
# number of candidate-year interaction dummies
# (distinct values of the first fixed-effect factor used in felm() below)
length(unique(dataset.C$candidate.year.id))

# number of candidates
length(unique(dataset.C$candidate.id))

# number of elections
length(unique(dataset.C$election))

# number of surname-prefecture interaction dummies
# (distinct values of the second fixed-effect factor used in felm() below)
length(unique(dataset.C$surname.prefecture.id))

# number of surnames
length(unique(dataset.C$surname.id))

# number of prefectures
length(unique(dataset.C$prefecture))

## point estimate of the coefficient, SE, and p-value (p. 7)
# estimation: log vote share on Same Surname, absorbing candidate-year and
# surname-prefecture fixed effects, no instruments (the `0` part), standard
# errors clustered by prefecture-year
main.result <- felm(log.vote.share ~ same.surname | 
                      candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                    dataset.C)

# point estimate
round(main.result$coefficients, 3)

# standard error (clustered by the prefecture-year)
round(main.result$cse, 3)

# p-value based on the clustered standard error; `$cpval` matches `$cse` above
# and the tables below, whereas `$pval` is computed from the unclustered SE
main.result$cpval

## interpretation
# average (non-logged) Vote Share in a low name recognition prefecture
# (observations with Same Surname = 0)
vote.share.low.prefecture <- mean(subset(dataset.C, same.surname == 0)$vote.share)
round(vote.share.low.prefecture, 3)

# annual average number of national candidates
# (distinct candidate ids per election in Dataset C, averaged over elections;
# note that this overwrites the table created in the DESIGN section)
number.of.national.candidates <- tapply(dataset.C$candidate.id, dataset.C$election, function(x) length(unique(x)))
average.number.of.national.candidates <- mean(number.of.national.candidates)
round(average.number.of.national.candidates, 1)
round(1 / average.number.of.national.candidates, 4)  # reciprocal of that average

# Vote Share in a high name recognition prefecture
# exp(coefficient) is the multiplicative effect of Same Surname on Vote Share
high.low.ratio <- exp(main.result$coefficients)
round(high.low.ratio, 2)

vote.share.high.prefecture <- vote.share.low.prefecture * high.low.ratio
round(vote.share.high.prefecture, 2)

# confidence interval (95%, normal approximation, on the ratio scale);
# `$coefficients` is spelled out instead of relying on partial matching of `$coef`
CI.lower <- exp(main.result$coefficients + qnorm(0.025) * main.result$cse)
CI.upper <- exp(main.result$coefficients + qnorm(0.975) * main.result$cse)

# Figure 1: bars for the average Vote Share (%) in low and high name recognition
# prefectures, with the 95% confidence interval drawn on the "High" bar
pdf("Figure_1.pdf", width = 3, height = 3, paper = "special", pointsize = 7)
par(mar = c(5, 6, 2, 2))
# empty canvas; both axes are drawn manually at the end
plot(NULL, NULL, type = "n", bty = "l", 
     xlim = c(0.5, 2.5), ylim = c(0, 1.25), 
     xlab = "Name recognition", ylab = "Vote Share (%)", 
     xaxt = "n", yaxt = "n")
abline(h = 0, lty = 3)
# light bar centred at x = 1 (low), dark bar centred at x = 2 (high)
rect(0.7, 0, 1.3, vote.share.low.prefecture, col = gray(0.8))
rect(1.7, 0, 2.3, vote.share.high.prefecture, col = gray(0.5))
# confidence interval: low-prefecture average scaled by the CI bounds of the ratio
arrows(x0 = 2, y0 = vote.share.low.prefecture * CI.lower, 
       x1 = 2, y1 = vote.share.low.prefecture * CI.upper, 
       length = 0.02, angle = 90, code = 3)
axis(side = 1, at = c(1, 2), labels = c("Low", "High"))
axis(side = 2, at = c(0, 0.5, 1))
dev.off()

#### Online Appendix --- DATA ####
## summary statistics of the number of candidates (p. 3)
# Table A.1
# re-tabulate the candidate counts (number.of.national.candidates was
# overwritten in the RESULTS section)
number.of.national.candidates <- table(dataset.A$election)
number.of.local.candidates <- table(dataset.B$prefecture.year.id)

# one row per level: five summary statistics followed by the number of
# elections (national) or prefecture-years (local)
Table.A1 <- rbind(
  c(summary.table(number.of.national.candidates, digits = 1), 
    length(number.of.national.candidates)), 
  c(summary.table(number.of.local.candidates, digits = 1), 
    length(number.of.local.candidates))
)
dimnames(Table.A1) <- list(c("National", "Local"), 
                           c("Mean", "Median", "SD", "Min", "Max", "N"))

Table.A1

## summary statistics of the datasets (p. 3)
# omit parties all of whose national candidates take on the same value of National Same Surname (see fn. 5 of the Online Appendix)
average.national.same.surname <- tapply(dataset.A$national.same.surname, dataset.A$party, mean)  # average of National Same Surname for each party
average.national.same.surname[which(average.national.same.surname == 0 | average.national.same.surname == 1)]  # parties such that average of National Same Surname is zero or one
dataset.A.for.analysis <- subset(dataset.A, ! party %in% c(19, 22, 23, 39, 57, 79))  # omit parties Nos. 19, 22, 23, 39, 57, and 79 (list hard-coded; the line above prints the parties to compare against)

# Table A.2
# Each block below collects the variables of one dataset as columns and applies
# summary.table() column-wise; t() turns the result into one row per variable.
# The row labels are attached by position further down, so the column order
# here must match the order of the labels.
variables.for.Table.A2.Dataset.A <- data.frame(dataset.A.for.analysis$national.same.surname,  # National Same Surname
                                               dataset.A.for.analysis$national.vote.share)  # National Vote Share (%)
Table.A2.Dataset.A <- t(apply(variables.for.Table.A2.Dataset.A, 2, summary.table))

variables.for.Table.A2.Dataset.B <- data.frame(dataset.B$local.same.surname,  # Local Same Surname
                                               dataset.B$local.vote.share,  # Local Vote Share (%)
                                               dataset.B$number.of.local.seats)  # Number Of Local Seats
# Fellow National Incumbent is summarised only over rows with Local Same Surname = 1
Table.A2.Dataset.B <- rbind(t(apply(variables.for.Table.A2.Dataset.B, 2, summary.table)), 
                            summary.table(subset(dataset.B, local.same.surname == 1)$fellow.national.incumbent))  # Fellow National Incumbent

# the party columns are logical indicators built by comparison with the party label
variables.for.Table.A2.Dataset.C <- data.frame(dataset.C$same.surname,  # Same Surname
                                               dataset.C$vote.share,  # Vote Share (%)
                                               dataset.C$incumbency,  # Incumbency
                                               dataset.C$female,  # Female
                                               dataset.C$party.dummy == "DSP",  # DSP
                                               dataset.C$party.dummy == "JSP",  # JSP
                                               dataset.C$party.dummy == "DPJ",  # DPJ
                                               dataset.C$party.dummy == "CGP",  # CGP
                                               dataset.C$party.dummy == "JCP",  # JCP
                                               dataset.C$party.dummy == "Minor",  # Minor
                                               dataset.C$party.dummy == "Independent")  # Independent
# variables summarised only over rows with Same Surname = 1
variables.for.Table.A2.subset.C <- data.frame(subset(dataset.C, same.surname == 1)$same.party,  # Same Party
                                              subset(dataset.C, same.surname == 1)$birth.place,  # Birth Place
                                              subset(dataset.C, same.surname == 1)$fellow.local.incumbent)  # Fellow Local Incumbent
Table.A2.Dataset.C <- rbind(t(apply(variables.for.Table.A2.Dataset.C, 2, summary.table)), 
                            t(apply(variables.for.Table.A2.subset.C, 2, summary.table)))

# stack the three datasets and label rows (by position) and columns
Table.A2 <- rbind(Table.A2.Dataset.A, Table.A2.Dataset.B, Table.A2.Dataset.C)
rownames(Table.A2) <- c("National Same Surname", "National Vote Share (%)", 
                                  "Local Same Surname", "Local Vote Share (%)", 
                                  "Number Of Local Seats", "Fellow National Incumbent", 
                                  "Same Surname", "Vote Share (%)", "Incumbency", "Female", 
                                  "DSP", "JSP", "DPJ", "CGP", "JCP", "Minor", "Independent", 
                                  "Same Party", "Birth Place", "Fellow Local Incumbent")
colnames(Table.A2) <- c("Mean", "Median", "SD", "Min", "Max")

Table.A2  # Table A2
nrow(dataset.A.for.analysis)  # number of observations for Dataset A
nrow(dataset.B)  # number of observations for Dataset B
nrow(subset(dataset.B, local.same.surname == 1))  # number of observations such that Local Same Surname = 1
nrow(dataset.C)  # number of observations for Dataset C
nrow(subset(dataset.C, same.surname == 1))  # number of observations such that Same Surname = 1

#### Online Appendix --- SUPPLEMENTARY ARGUMENT ####
## comparison of the number of candidates between national and local districts (p. 3)
# (same computation as in the Main Text --- DESIGN section)
# number of candidates: rows of Dataset A per election and rows of Dataset B
# per prefecture-year
number.of.national.candidates <- table(dataset.A$election)
number.of.local.candidates <- table(dataset.B$prefecture.year.id)

# percentage of local districts where the number of local candidates was in single digits
# (rounded to an integer)
round(sum(number.of.local.candidates < 10) * 100 / length(number.of.local.candidates))

# average number of local candidates per prefecture-year (one decimal place)
round(mean(number.of.local.candidates), 1)

# average number of national candidates per election (one decimal place)
round(mean(number.of.national.candidates), 1)

#### Online Appendix --- SUPPLEMENTARY ANALYSES --- Validity of Exogeneity Assumption ####
## probability that two randomly chosen national and local candidates share a surname (p. 5)
# The 17 elections used here carry the codes 1-12 and 19-23 in the `election`
# column, so list positions 13-17 are shifted by 6 to obtain the election code.
random.isonymy <- list()
for (i in 1:17) {
  subdata.national <- subset(dataset.A, election == i + 6 * (i > 12))
  subdata.local <- subset(dataset.B, election == i + 6 * (i > 12))
  # record 1 if a national candidate (row) and a local candidate (column) share the surname and 0 otherwise
  random.isonymy[[i]] <- outer(as.character(subdata.national$surname.id), 
                               as.character(subdata.local$surname.id), 
                               function(x, y) ifelse(x == y, 1, 0))
}

# probability for each election: share of national-local pairs with the same surname
random.isonymy.prob <- vapply(random.isonymy, 
                              function(pairs) sum(pairs) / length(pairs), 
                              numeric(1))
round(random.isonymy.prob * 100, 2)
round(min(random.isonymy.prob) * 100, 2)  # minimum
round(max(random.isonymy.prob) * 100, 2)  # maximum

## exogeneity check 1 (pp. 6-7)
# omit parties all of whose national candidates take on the same value of National Same Surname (fn. 5)
average.national.same.surname <- tapply(dataset.A$national.same.surname, dataset.A$party, mean)  # average of National Same Surname for each party
average.national.same.surname[which(average.national.same.surname == 0 | average.national.same.surname == 1)]  # parties such that average of National Same Surname is zero or one
dataset.A.for.analysis <- subset(dataset.A, ! party %in% c(19, 22, 23, 39, 57, 79))  # omit parties Nos. 19, 22, 23, 39, 57, and 79

# number of observations
nrow(dataset.A)
nrow(dataset.A.for.analysis)

# estimation: logit of National Same Surname on National Vote Share with party
# and election dummies (no intercept), so coefficient 1 is National Vote Share
exogeneity.check.1.result <- glm(national.same.surname ~ national.vote.share + 
                                   as.factor(party) + as.factor(election) - 1, 
                                 family = binomial, dataset.A.for.analysis)

# compute the clustered robust standard error by year
g <- estfun(exogeneity.check.1.result)  # score contribution of every observation
h <- vcov(exogeneity.check.1.result)    # model-based covariance matrix (the "bread")
# sum the scores within each election so that every cluster contributes one row
# (assumes glm() dropped no rows, i.e. g has one row per row of the data)
g.cluster <- rowsum(g, dataset.A.for.analysis$election)
number.of.clusters <- length(unique(dataset.A.for.analysis$election))
# sandwich estimator with the G / (G - 1) small-sample correction. The "meat"
# must be built from the cluster-level score sums: using the observation-level
# scores `g` here would give a heteroskedasticity-robust, not a clustered,
# covariance matrix.
clustered.vcov <- h %*% 
  ((number.of.clusters / (number.of.clusters - 1)) * crossprod(g.cluster)) %*% h
exogeneity.check.1.coefficient <- exogeneity.check.1.result$coefficients[1]
exogeneity.check.1.clustered.SE <- sqrt(diag(clustered.vcov))[1]
# two-sided p-value from the standard normal distribution
exogeneity.check.1.p.value <- 2 * (pnorm(-abs(exogeneity.check.1.coefficient / exogeneity.check.1.clustered.SE)))

# Table A.4 (columns: estimate, significance stars, clustered SE in parentheses)
Table.A4 <- matrix("", 2, 3)
Table.A4[1, 1] <- sprintf("%5.3f", exogeneity.check.1.coefficient)
Table.A4[1, 2] <- significance(exogeneity.check.1.p.value)
Table.A4[1, 3] <- paste0("(", sprintf("%5.3f", exogeneity.check.1.clustered.SE), ")")
Table.A4[2, 1] <- nrow(dataset.A.for.analysis)
rownames(Table.A4) <- c("National Vote Share", "N")
print(Table.A4, quote = FALSE)

## exogeneity check 2 (pp. 7-8)
# number of observations
nrow(dataset.B)

# 2.5, 50, and 97.5 percentiles of Local Vote Share (fn. 11 of the Online Appendix)
round(quantile(dataset.B$local.vote.share, probs = c(0.025, 0.5, 0.975)), 2)

# estimation: log local vote share on Local Same Surname and log seats, with
# party, election, and surname fixed effects; SEs clustered by prefecture-year
exogeneity.check.2.result <- felm(log.local.vote.share ~ local.same.surname + log.number.of.local.seats | 
                                    party + election + surname.id | 0 | prefecture.year.id, dataset.B)

# results
summary(exogeneity.check.2.result)

# Table A.5 (columns: estimate, significance stars, clustered SE in parentheses)
# the cells are listed row by row; matrix() drops the names carried by the stars
Table.A5 <- matrix(c(
  # Local Same Surname
  sprintf("%5.3f", exogeneity.check.2.result$coefficients[1]), 
  significance(exogeneity.check.2.result$cpval[1]), 
  paste0("(", sprintf("%5.3f", exogeneity.check.2.result$cse[1]), ")"), 
  # log(Number Of Local Seats)
  sprintf("%5.3f", exogeneity.check.2.result$coefficients[2]), 
  significance(exogeneity.check.2.result$cpval[2]), 
  paste0("(", sprintf("%5.3f", exogeneity.check.2.result$cse[2]), ")"), 
  # N
  nrow(dataset.B), "", ""
), nrow = 3, ncol = 3, byrow = TRUE)
rownames(Table.A5) <- c("Local Same Surname", "log(Number Of Local Seats)", "N")
print(Table.A5, quote = FALSE)

#### Online Appendix --- SUPPLEMENTARY ANALYSES --- Effects of Shared Surname on Vote Share ####
## estimation (pp. 8-10)
# number of observations
nrow(dataset.C)

# number of dummy variables
# the product lines give the number of all possible combinations, for
# comparison with the number of combinations that actually occur in the data
length(unique(dataset.C$candidate.id))  # number of candidates
length(unique(dataset.C$election))  # number of elections
length(unique(dataset.C$candidate.id)) * length(unique(dataset.C$election))
length(unique(dataset.C$candidate.year.id))  # number of candidate-year interaction dummies

length(unique(dataset.C$surname.id))  # number of surnames
length(unique(dataset.C$prefecture))  # number of prefectures
length(unique(dataset.C$surname.id)) * length(unique(dataset.C$prefecture))
length(unique(dataset.C$surname.prefecture.id))  # number of surname-prefecture interaction dummies

# surnames that disappear when the earliest elections are dropped
length(unique(dataset.C$surname.id)) - 
  length(unique(subset(dataset.C, election > 1)$surname.id))  # number of surnames that appear only when there were 45 contested local districts
length(unique(subset(dataset.C, election > 1)$surname.id)) - 
  length(unique(subset(dataset.C, election > 9)$surname.id))  # number of surnames that appear only when there were 46 local districts

# number of surnames used by only one national candidate
# (rows of Dataset A per surname; the second line is the share of such surnames)
family.name.distributuion <- table(dataset.A$surname.id)
sum(family.name.distributuion == 1)
round(sum(family.name.distributuion == 1) / length(family.name.distributuion), 3)

# Figure A.1 (distribution of the number of national candidates who use the same surname)
pdf("Figure_A1.pdf", width = 5, height = 3, paper = "special", pointsize = 7)
par(mar = c(4, 4, 2, 2))
# unit-width bins from 0 to 35; densities are relabelled as percentages below
hist(family.name.distributuion, breaks = seq(0, 35, by = 1), freq = FALSE, 
     main = "", xlab = "Number of national candidates who use the same surname", 
     ylab = "Distribution (%)", yaxt = "n")
axis(2, at = seq(0, 0.5, by = 0.1), labels = seq(0, 50, by = 10))
dev.off()

# maximum number of national candidates with the same surname
# (must not exceed 35, the last histogram break above)
max(family.name.distributuion)

# 2.5, 50, and 97.5 percentiles of Vote Share
round(quantile(dataset.C$vote.share, probs = c(0.025, 0.5, 0.975)), 3)

# number of observations with zero votes
sum(dataset.C$number.of.votes == 0)

# estimation (same specification as in the Main Text): log vote share on Same
# Surname with candidate-year and surname-prefecture fixed effects, standard
# errors clustered by prefecture-year
main.result <- felm(log.vote.share ~ same.surname | 
                      candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                    dataset.C)

# exponentiated coefficient estimate and its 95% CI
high.low.ratio <- exp(main.result$coefficients)
round(high.low.ratio, 2)  # point estimate

# bounds use the normal approximation with the clustered SE; `$coefficients` is
# spelled out instead of relying on partial matching of `$coef`
CI.lower <- exp(main.result$coefficients + qnorm(0.025) * main.result$cse)
round(CI.lower, 2)  # lower bound of the 95% CI

CI.upper <- exp(main.result$coefficients + qnorm(0.975) * main.result$cse)
round(CI.upper, 2)  # upper bound of the 95% CI

## evaluation 1: whether the same-surname effect can influence the electoral results (pp. 10-11)
# vote share margins between the national-level last winner and the runner-up national candidate
dataset.C.SNTV <- subset(dataset.C, election < 13)  # use SNTV candidates only

margin.SNTV <- margin.SNTV.pp <- rep(NA, 12)
for (i in 1:12) {
  subdata <- subset(dataset.A, election == i)
  # fewest votes among winners (win == 1) minus most votes among losers (win is 0, 2, or 3)
  margin.SNTV[i] <- min(subset(subdata, win == 1)$number.of.votes) - 
    max(subset(subdata, win == 0 | win == 2 | win == 3)$number.of.votes)
  # the same margin as a percentage of the national vote total of that election
  margin.SNTV.pp[i] <- margin.SNTV[i] * 100 / 
    dataset.C.SNTV$number.of.national.votes[dataset.C.SNTV$election == i][1]
}

# smallest actual margin
min.margin.SNTV <- min(margin.SNTV)
min.margin.SNTV  # number of votes
1944 + (which(margin.SNTV == min.margin.SNTV) * 3) # year (election i is mapped to year 1944 + 3 * i)
round(min(margin.SNTV.pp), 4)  # percentage points

# mean actual margin
round(mean(margin.SNTV.pp), 2)  # percentage points

# number of votes of runner-up candidates in each prefecture
# rows are the 12 SNTV elections; columns are filled from the left with as many
# districts as existed in that election (45, 46, or 47), the rest stay NA
runner.up.vote <- matrix(NA, 12, 47)
for (i in 1:12) {
  subdata <- subset(dataset.A, election == i & (win == 0 | win == 2 | win == 3))  # extract losers
  runner.up <- subdata[which(subdata$number.of.votes == max(subdata$number.of.votes)), ]$candidate.id  # loser who obtained the highest number of votes
  subdata2 <- subset(dataset.C.SNTV, election == i & candidate.id == runner.up)  # extract the runner-up national candidates
  number.of.districts <- if (i == 1) 45 else if (i < 10) 46 else 47
  district.columns <- seq_len(number.of.districts)
  runner.up.vote[i, district.columns] <- subdata2$number.of.votes
  runner.up.vote[i, district.columns][subdata2$same.surname == 1] <- NA  # exclude districts where there are local candidates who share the surname with the runner-up national candidate
}

# the number of low name recognition prefecture of every runner-up national candidate under SNTV
sum(!is.na(runner.up.vote))

# possibility of reversal
# high.low.ratio is a 1 x 1 matrix; convert it to a plain number so that it is
# not recycled as an array against the length-47 rows (deprecated in R)
vote.gain.ratio <- as.vector(high.low.ratio) - 1
reversal <- rep(NA, 12)
reversal.prefecture <- matrix(NA, 12, 47)
for (i in 1:12) {
  # record TRUE if the result could be reversed for each district and election,
  # i.e. the extra votes implied by the same-surname effect exceed the actual
  # margin; cells without a usable runner-up vote count stay NA
  reversal.prefecture[i, ] <- runner.up.vote[i, ] * vote.gain.ratio > margin.SNTV[i]
  # number of districts where the result could be reversed
  reversal[i] <- sum(reversal.prefecture[i, ], na.rm = TRUE)
}
round(sum(reversal) / sum(! is.na(runner.up.vote)), 2)  # proportion of the number of districts where the result could be reversed
# NOTE(review): column 31 is Tottori only if the columns line up with the
# prefecture codes in every election -- confirm for elections with 45/46 districts
1944 + (which(reversal.prefecture[, 31]) * 3)  # Tottori prefecture as an example

# Figure A2 (distribution of the would-be nationwide vote share margins)
# For every SNTV election and district: the runner-up's national vote share if
# the votes in that district were scaled up by the same-surname effect, minus
# the vote share of the last winner, in percentage points.
wouldbe.vote.share.margin <- matrix(NA, 12, 47)
for (i in 1:12) {
  subdata <- subset(dataset.A, election == i)
  last.winner.share <- min(subset(subdata, win == 1)$national.vote.share) / 100  # vote share of the last winner
  # quantities that do not depend on the district
  runner.up.national.votes <- max(subset(subdata, win == 0 | win == 2 | win == 3)$number.of.votes)
  total.national.votes <- subset(dataset.C.SNTV, election == i)$number.of.national.votes[1]
  number.of.districts <- if (i == 1) 45 else if (i < 10) 46 else 47
  for (j in seq_len(number.of.districts)) {
    runner.up.share <- (runner.up.national.votes + 
                          runner.up.vote[i, j] * (high.low.ratio - 1)) / 
      total.national.votes
    wouldbe.vote.share.margin[i, j] <- (runner.up.share - last.winner.share) * 100
  }
}
# bin densities are needed to re-draw the non-negative bins in a darker shade
wouldbe.vote.share.margin.hist <- hist(wouldbe.vote.share.margin, 
                                       breaks = seq(-0.10, 0.25, by = 0.01), plot = FALSE)

pdf("Figure_A2.pdf", width = 4.5, height = 3, paper = "special", pointsize = 7)
par(mar = c(5, 4, 2, 2))
hist(wouldbe.vote.share.margin, breaks = seq(-0.10, 0.25, by = 0.01), freq = FALSE, 
     col = gray(0.8), border = gray(0.8), main = "", xlab = "", ylab = "Distribution (%)", yaxt = "n")
# bins 11-35 cover margins from 0 to 0.25; overdraw the non-empty ones
for (i in 1:25) {
  bar.height <- wouldbe.vote.share.margin.hist$density[10 + i]
  if (bar.height > 0) {
    rect(0.01 * i - 0.01, 0, 0.01 * i, bar.height, 
         col = gray(0.5), border = gray(0.5))
  }
}
abline(v = 0, lty = 3)
axis(2, at = c(0, 10, 20, 30))
mtext("Would-be nationwide vote share margins (%)\n (= runner up share - last winner share)", 
      side = 1, line = 4)
dev.off()

## evaluation 2: comparison with other variables (pp. 11-12)
# estimation
dataset.C$party.dummy <- factor(dataset.C$party.dummy,  # set the base category of Party as the LDP
                                levels = c("LDP", "DSP", "JSP", "DPJ", "CGP", "JCP", "Minor", "Independent"))

# election and surname-prefecture fixed effects; SEs clustered by prefecture-year
result.comparison <- felm(log.vote.share ~ same.surname + incumbency + female + party.dummy | 
                            election + surname.prefecture.id | 0 | prefecture.year.id, dataset.C)

# results
round(summary(result.comparison)$coefficients, 3)

# The coefficients are addressed by position below: 1 = Same Surname,
# 2 = Incumbency, 3 = Female, 4-10 = party dummies in the level order set above.

# absolute value of the estimated coefficient of Same Surname
abs(round(result.comparison$coefficients[1], 3))

# absolute value of the estimated coefficient of JSP
abs(round(result.comparison$coefficients[5], 3))

# absolute value of the estimated coefficient of DPJ
abs(round(result.comparison$coefficients[6], 3))

# Figure A3: exponentiated point estimates with 95% CIs (normal approximation)
Figure.A3.point <- exp(result.comparison$coefficients)
Figure.A3.upper <- exp(result.comparison$coefficients + qnorm(0.975) * result.comparison$cse)
Figure.A3.lower <- exp(result.comparison$coefficients + qnorm(0.025) * result.comparison$cse)
Figure.A3.labels <- c("Same Surname", "Incumbency", "Female", 
                      "DSP", "JSP", "DPJ", "CGP", "JCP", "Minor", "Independent")

pdf("Figure_A3.pdf", width = 6, height = 3, paper = "special", pointsize = 7)
par(mar = c(5, 2, 2, 2))
# the axis label previously read "Coeffiecients" (typo)
plot(NULL, NULL, bty = "n", xlim = c(0, 2), 
     ylim = c(-11, 0), xlab = "Exponentiated Estimates of Coefficients", ylab = "", yaxt = "n")
# one row per labelled coefficient, drawn from top (y = -1) to bottom
for (i in seq_along(Figure.A3.labels)) {
  points(Figure.A3.point[i], -i, pch = 19)
  lines(x = c(Figure.A3.lower[i], Figure.A3.upper[i]), y = rep(-i, 2))
  text(Figure.A3.point[i], -i, Figure.A3.labels[i], pos = 3, cex = 0.8)
}
abline(v = 1, lty = 3)  # a ratio of 1 means no effect
dev.off()

#### Online Appendix --- ROBUSTNESS CHECKS --- Party ####
## p. 13
# number of observations such that Same Party = 1
sum(dataset.C$same.party)
round(sum(dataset.C$same.party) * 100 / sum(dataset.C$same.surname), 1)  # as a percentage of Same Surname = 1
sum(dataset.C$same.surname)

# probability that two randomly chosen national and local candidates in each election belong to the same party
same.party <- list()
for (i in 1:12) {
  subdata.national <- subset(dataset.A, election == i)
  subdata.local <- subset(dataset.B, election == i)
  # record 1 if a national candidate (row) and a local candidate (column) belong to the same party and 0 otherwise
  same.party[[i]] <- outer(subdata.national$party, subdata.local$party, function(x, y) ifelse(x == y, 1, 0))
  # record 0 if a national candidate is an independent
  # (which() returns an empty index when there is no independent or no national
  # candidate at all, so this is safe for empty subsets, unlike 1:nrow())
  same.party[[i]][which(subdata.national$independent == 1), ] <- 0
}
# list positions 13-17 hold the elections coded 19-23
# NOTE(review): independents are not set to 0 in this second loop -- presumably
# no independents ran in these elections; confirm.
for (i in 13:17) {
  subdata.national <- subset(dataset.A, election == i + 6)
  subdata.local <- subset(dataset.B, election == i + 6)
  # record 1 if a national candidate (row) and a local candidate (column) belong to the same party and 0 otherwise
  same.party[[i]] <- outer(subdata.national$party, subdata.local$party, function(x, y) ifelse(x == y, 1, 0))
}

# percentage of same-party pairs among all national-local pairs, pooled over elections
round(sum(unlist(same.party)) * 100 / length(unlist(same.party)), 1)

# estimation
# Equation (1): baseline specification
main.result <- felm(log.vote.share ~ same.surname | 
                      candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                    dataset.C)

# Equation (A.1): adds the interaction of Same Surname with Same Party
same.party.result <- felm(log.vote.share ~ same.surname + same.surname:same.party | 
                            candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                          dataset.C)

# Table A.6 (per equation: estimate, significance stars, clustered SE in parentheses)
Table.A6 <- matrix("", 3, 6)
Table.A6[1, 1] <- sprintf("%5.3f", main.result$coefficients)
Table.A6[1, 2] <- significance(main.result$cpval)
Table.A6[1, 3] <- paste0("(", sprintf("%5.3f", main.result$cse), ")")
Table.A6[1, 4] <- sprintf("%5.3f", same.party.result$coefficients[1])
Table.A6[1, 5] <- significance(same.party.result$cpval[1])
Table.A6[1, 6] <- paste0("(", sprintf("%5.3f", same.party.result$cse[1]), ")")
Table.A6[2, 4] <- sprintf("%5.3f", same.party.result$coefficients[2])
Table.A6[2, 5] <- significance(same.party.result$cpval[2])
Table.A6[2, 6] <- paste0("(", sprintf("%5.3f", same.party.result$cse[2]), ")")
Table.A6[3, 1] <- Table.A6[3, 4] <- nrow(dataset.C)
rownames(Table.A6) <- c("Same Surname", "Same Surname * Same Party", "N")
colnames(Table.A6) <- c("Equation (1)", "", "", "Equation (A.1)", "", "")
print(Table.A6, quote = FALSE)

#### Online Appendix --- ROBUSTNESS CHECKS --- Birth Place ####
## pp. 14-15
dataset.C.2013 <- subset(dataset.C, election == 23)  # use the data of the 2013 election only

# number of observations
nrow(dataset.C.2013)

# number of candidates
length(unique(dataset.C.2013$candidate.id))

# number of candidates who have a local candidate of the same surname in their prefecture of birth
sum(dataset.C.2013$same.surname * dataset.C.2013$birth.place)

# estimation: candidate fixed effects, standard errors clustered by prefecture
# Equation (A.2): Same Surname only
election.2013.result <- felm(log.vote.share ~ same.surname | 
                               candidate.id | 0 | prefecture, dataset.C.2013)

# Equation (A.3): adds Birth Place
birth.place.result <- felm(log.vote.share ~ same.surname + birth.place | 
                             candidate.id | 0 | prefecture, dataset.C.2013)

# Table A.7 (per equation: estimate, significance stars, clustered SE in parentheses)
# the cells are listed row by row; matrix() drops the names carried by the stars
Table.A7 <- matrix(c(
  # Same Surname
  sprintf("%5.3f", election.2013.result$coefficients), 
  significance(election.2013.result$cpval), 
  paste0("(", sprintf("%5.3f", election.2013.result$cse), ")"), 
  sprintf("%5.3f", birth.place.result$coefficients[1]), 
  significance(birth.place.result$cpval[1]), 
  paste0("(", sprintf("%5.3f", birth.place.result$cse[1]), ")"), 
  # Birth Place (Equation (A.3) only)
  "", "", "", 
  sprintf("%5.3f", birth.place.result$coefficients[2]), 
  significance(birth.place.result$cpval[2]), 
  paste0("(", sprintf("%5.3f", birth.place.result$cse[2]), ")"), 
  # N
  nrow(dataset.C.2013), "", "", nrow(dataset.C.2013), "", ""
), nrow = 3, ncol = 6, byrow = TRUE)
rownames(Table.A7) <- c("Same Surname", "Birth Place", "N")
colnames(Table.A7) <- c("Equation (A.2)", "", "", "Equation (A.3)", "", "")
print(Table.A7, quote = FALSE)

#### Online Appendix --- ROBUSTNESS CHECKS --- Incumbency ####
## whether there are coattails from local incumbents who share the same name as new national candidates (pp. 15-16)
dataset.C.nonincumbent <- subset(dataset.C, incumbency == 0)  # use the data of non-incumbents only

# number of observations
nrow(dataset.C.nonincumbent)

# estimation: baseline fixed effects and clustering, plus the interaction of
# Same Surname with Fellow Local Incumbent
fellow.local.incumbent.result <- felm(log.vote.share ~ same.surname + same.surname:fellow.local.incumbent | 
                                        candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                                      dataset.C.nonincumbent)

# Table A.8 (columns: estimate, significance stars, clustered SE in parentheses)
# the cells are listed row by row; matrix() drops the names carried by the stars
Table.A8 <- matrix(c(
  # Same Surname
  sprintf("%5.3f", fellow.local.incumbent.result$coefficients[1]), 
  significance(fellow.local.incumbent.result$cpval[1]), 
  paste0("(", sprintf("%5.3f", fellow.local.incumbent.result$cse[1]), ")"), 
  # Same Surname * Fellow Local Incumbent
  sprintf("%5.3f", fellow.local.incumbent.result$coefficients[2]), 
  significance(fellow.local.incumbent.result$cpval[2]), 
  paste0("(", sprintf("%5.3f", fellow.local.incumbent.result$cse[2]), ")"), 
  # N
  nrow(dataset.C.nonincumbent), "", ""
), nrow = 3, ncol = 3, byrow = TRUE)
rownames(Table.A8) <- c("Same Surname", "Same Surname * Fellow Local Incumbent", "N")
print(Table.A8, quote = FALSE)

## whether there are coattails from national incumbents who share the same name as new local candidates (p. 16)
dataset.B.nonincumbent <- subset(dataset.B, incumbency == 0)  # use the data of non-incumbents only

# number of observations
nrow(dataset.B.nonincumbent)

## estimation: party, election, and surname fixed effects; SEs clustered by prefecture-year
fellow.national.incumbent.result <- felm(log.local.vote.share ~ local.same.surname + 
                                           local.same.surname:fellow.national.incumbent + 
                                           log.number.of.local.seats | 
                                           party + election + surname.id | 0 | prefecture.year.id, 
                                         dataset.B.nonincumbent)

# Table A.9 (columns: estimate, significance stars, clustered SE in parentheses)
# The table rows are not in coefficient order: the table's second row takes
# coefficient 3 and its third row takes coefficient 2.
# NOTE(review): this relies on the interaction term being stored after both
# main effects -- confirm against the printed summary.
Table.A9 <- matrix(c(
  # Local Same Surname
  sprintf("%5.3f", fellow.national.incumbent.result$coefficients[1]), 
  significance(fellow.national.incumbent.result$cpval[1]), 
  paste0("(", sprintf("%5.3f", fellow.national.incumbent.result$cse[1]), ")"), 
  # Local Same Surname * Fellow National Incumbent
  sprintf("%5.3f", fellow.national.incumbent.result$coefficients[3]), 
  significance(fellow.national.incumbent.result$cpval[3]), 
  paste0("(", sprintf("%5.3f", fellow.national.incumbent.result$cse[3]), ")"), 
  # log(Number Of Local Seats)
  sprintf("%5.3f", fellow.national.incumbent.result$coefficients[2]), 
  significance(fellow.national.incumbent.result$cpval[2]), 
  paste0("(", sprintf("%5.3f", fellow.national.incumbent.result$cse[2]), ")"), 
  # N
  nrow(dataset.B.nonincumbent), "", ""
), nrow = 4, ncol = 3, byrow = TRUE)
rownames(Table.A9) <- c("Local Same Surname", "Local Same Surname * Fellow National Incumbent", 
                        "log(Number Of Local Seats)", "N")
print(Table.A9, quote = FALSE)

#### Online Appendix --- COMPARISON BETWEEN SNTV AND OLPR ####
## exogeneity check 1 (p. 17)
# omit parties all of whose national candidates take on the same value of National Same Surname (see fn. 5 of the Online Appendix)
# the parties to omit are derived from the party-level averages instead of being typed in by hand,
# so the sample definition always agrees with the data that is actually loaded

# SNTV subsample: elections numbered below 13
dataset.A.SNTV <- subset(dataset.A, election < 13)
average.national.same.surname <- tapply(dataset.A.SNTV$national.same.surname, dataset.A.SNTV$party, mean)  # average of National Same Surname for each party
no.variation.SNTV <- which(average.national.same.surname == 0 | average.national.same.surname == 1)
average.national.same.surname[no.variation.SNTV]  # parties such that average of National Same Surname is zero or one
parties.to.omit.SNTV <- names(average.national.same.surname)[no.variation.SNTV]  # Nos. 19, 22, 23, 39, and 57 in the replication data
dataset.A.for.analysis.SNTV <- subset(dataset.A.SNTV, ! party %in% parties.to.omit.SNTV)

# OLPR subsample: elections numbered above 18
dataset.A.OLPR <- subset(dataset.A, election > 18)
average.national.same.surname <- tapply(dataset.A.OLPR$national.same.surname, dataset.A.OLPR$party, mean)  # average of National Same Surname for each party
no.variation.OLPR <- which(average.national.same.surname == 0 | average.national.same.surname == 1)
average.national.same.surname[no.variation.OLPR]  # parties such that average of National Same Surname is zero or one
parties.to.omit.OLPR <- names(average.national.same.surname)[no.variation.OLPR]  # No. 79 in the replication data
dataset.A.for.analysis.OLPR <- subset(dataset.A.OLPR, ! party %in% parties.to.omit.OLPR)

# number of observations
nrow(dataset.A.for.analysis.SNTV)
nrow(dataset.A.for.analysis.OLPR)

# estimation
# binomial GLM of National Same Surname on national vote share with party and election dummies
# and no intercept, fitted separately on the SNTV and the OLPR samples
exogeneity.check.1.SNTV.result <- glm(
  formula = national.same.surname ~ national.vote.share + as.factor(party) + as.factor(election) - 1,
  family = binomial,
  data = dataset.A.for.analysis.SNTV
)

exogeneity.check.1.OLPR.result <- glm(
  formula = national.same.surname ~ national.vote.share + as.factor(party) + as.factor(election) - 1,
  family = binomial,
  data = dataset.A.for.analysis.OLPR
)

# compute the clustered robust standard error by year
# sandwich estimator: bread = vcov() of the fit, meat = cross-product of the score contributions
# SUMMED WITHIN EACH ELECTION, scaled by G / (G - 1) where G is the number of elections.
# The meat has to be built from the cluster sums (g.cluster.*); taking the cross-product of the
# observation-level scores would only give heteroskedasticity-robust, not clustered, standard errors.
# The first coefficient is the one on national.vote.share, which is what the table reports.

# SNTV
number.of.clusters.SNTV <- length(unique(dataset.A.for.analysis.SNTV$election))
g.SNTV <- estfun(exogeneity.check.1.SNTV.result)  # observation-level score contributions
h.SNTV <- vcov(exogeneity.check.1.SNTV.result)
g.cluster.SNTV <- rowsum(g.SNTV, dataset.A.for.analysis.SNTV$election)  # one row of summed scores per election
clustered.vcov.SNTV <- h.SNTV %*% 
  ((number.of.clusters.SNTV / (number.of.clusters.SNTV - 1)) * 
     crossprod(g.cluster.SNTV)) %*% h.SNTV
exogeneity.check.1.SNTV.coefficient <- exogeneity.check.1.SNTV.result$coefficients[1]
exogeneity.check.1.SNTV.clustered.SE <- sqrt(diag(clustered.vcov.SNTV))[1]
# two-sided p-value from the standard normal distribution
exogeneity.check.1.SNTV.p.value <- 2 * (pnorm(-abs(exogeneity.check.1.SNTV.coefficient / 
                                                     exogeneity.check.1.SNTV.clustered.SE)))

# OLPR
number.of.clusters.OLPR <- length(unique(dataset.A.for.analysis.OLPR$election))
g.OLPR <- estfun(exogeneity.check.1.OLPR.result)  # observation-level score contributions
h.OLPR <- vcov(exogeneity.check.1.OLPR.result)
g.cluster.OLPR <- rowsum(g.OLPR, dataset.A.for.analysis.OLPR$election)  # one row of summed scores per election
clustered.vcov.OLPR <- h.OLPR %*% 
  ((number.of.clusters.OLPR / (number.of.clusters.OLPR - 1)) * 
     crossprod(g.cluster.OLPR)) %*% h.OLPR
exogeneity.check.1.OLPR.coefficient <- exogeneity.check.1.OLPR.result$coefficients[1]
exogeneity.check.1.OLPR.clustered.SE <- sqrt(diag(clustered.vcov.OLPR))[1]
# two-sided p-value from the standard normal distribution
exogeneity.check.1.OLPR.p.value <- 2 * (pnorm(-abs(exogeneity.check.1.OLPR.coefficient / 
                                                     exogeneity.check.1.OLPR.clustered.SE)))

# Table A.10
# columns 1-3 hold the SNTV result and columns 4-6 the OLPR result; within each triple the cells are
# (coefficient, significance stars, clustered SE in parentheses); the second row reports N
Table.A10 <- matrix("", nrow = 2, ncol = 6, 
                    dimnames = list(c("National Vote Share", "N"), 
                                    c("SNTV", "", "", "OLPR", "", "")))
Table.A10[1, 1:3] <- c(sprintf("%5.3f", exogeneity.check.1.SNTV.coefficient), 
                       significance(exogeneity.check.1.SNTV.p.value), 
                       paste0("(", sprintf("%5.3f", exogeneity.check.1.SNTV.clustered.SE), ")"))
Table.A10[1, 4:6] <- c(sprintf("%5.3f", exogeneity.check.1.OLPR.coefficient), 
                       significance(exogeneity.check.1.OLPR.p.value), 
                       paste0("(", sprintf("%5.3f", exogeneity.check.1.OLPR.clustered.SE), ")"))
Table.A10[2, c(1, 4)] <- c(nrow(dataset.A.for.analysis.SNTV), nrow(dataset.A.for.analysis.OLPR))
print(Table.A10, quote = FALSE)

## exogeneity check 2 (p. 18)
# split the local-candidate data into the same two periods used for exogeneity check 1
dataset.B.SNTV <- subset(dataset.B, subset = election < 13)
dataset.B.OLPR <- subset(dataset.B, subset = election > 18)

# number of observations
nrow(dataset.B.SNTV)
nrow(dataset.B.OLPR)

# estimation
# felm formula parts: outcome ~ regressors | fixed effects | instruments (none) | cluster variable
exogeneity.check.2.SNTV.result <- felm(
  log.local.vote.share ~ local.same.surname + log.number.of.local.seats | 
    party + election + surname.id | 0 | prefecture.year.id, 
  data = dataset.B.SNTV
)

exogeneity.check.2.OLPR.result <- felm(
  log.local.vote.share ~ local.same.surname + log.number.of.local.seats | 
    party + election + surname.id | 0 | prefecture.year.id, 
  data = dataset.B.OLPR
)

# results
summary(exogeneity.check.2.SNTV.result)
summary(exogeneity.check.2.OLPR.result)

# Table A.11
# rows 1-2 follow the coefficient order of the models (Local Same Surname, log(Number Of Local Seats));
# columns 1-3 hold the SNTV result and columns 4-6 the OLPR result, each as
# (coefficient, significance stars, SE in parentheses); row 3 reports N
Table.A11 <- matrix("", 3, 6)
for (k in seq_len(2)) {
  Table.A11[k, 1:3] <- c(sprintf("%5.3f", exogeneity.check.2.SNTV.result$coefficients[k]), 
                         significance(exogeneity.check.2.SNTV.result$cpval[k]), 
                         paste0("(", sprintf("%5.3f", exogeneity.check.2.SNTV.result$cse[k]), ")"))
  Table.A11[k, 4:6] <- c(sprintf("%5.3f", exogeneity.check.2.OLPR.result$coefficients[k]), 
                         significance(exogeneity.check.2.OLPR.result$cpval[k]), 
                         paste0("(", sprintf("%5.3f", exogeneity.check.2.OLPR.result$cse[k]), ")"))
}
Table.A11[3, c(1, 4)] <- c(nrow(dataset.B.SNTV), nrow(dataset.B.OLPR))
dimnames(Table.A11) <- list(c("Local Same Surname", "log(Number Of Local Seats)", "N"), 
                            c("SNTV", "", "", "OLPR", "", ""))
print(Table.A11, quote = FALSE)

## effects of shared surname on vote share for SNTV and OLPR (pp. 18-19)
dataset.C.SNTV <- subset(dataset.C, election < 13)
dataset.C.OLPR <- subset(dataset.C, election > 18)

# number of observations
nrow(dataset.C.SNTV)
nrow(dataset.C.OLPR)

# estimation
# same specification as the main model, fitted separately on each period:
# outcome ~ regressor | fixed effects | instruments (none) | cluster variable
main.SNTV.result <- felm(log.vote.share ~ same.surname | 
                           candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                         dataset.C.SNTV)

main.OLPR.result <- felm(log.vote.share ~ same.surname | 
                           candidate.year.id + surname.prefecture.id | 0 | prefecture.year.id, 
                         dataset.C.OLPR)

# exponentiated coefficient estimates
# the outcome is in logs, so exp(coefficient) is the ratio of vote shares (same.surname = 1 over 0)
high.low.ratio.SNTV <- exp(main.SNTV.result$coefficients)
round(high.low.ratio.SNTV, 2)

high.low.ratio.OLPR <- exp(main.OLPR.result$coefficients)
round(high.low.ratio.OLPR, 2)

# 95% CI of the exponentiated coefficients
# built on the log scale with normal quantiles and the `cse` standard errors, then exponentiated;
# the component is spelled out as `$coefficients` so nothing depends on partial matching of `$`
CI.lower.SNTV <- exp(main.SNTV.result$coefficients + qnorm(0.025) * main.SNTV.result$cse)
round(CI.lower.SNTV, 2)  # lower bound of the 95% CI
CI.upper.SNTV <- exp(main.SNTV.result$coefficients + qnorm(0.975) * main.SNTV.result$cse)
round(CI.upper.SNTV, 2)  # upper bound of the 95% CI

CI.lower.OLPR <- exp(main.OLPR.result$coefficients + qnorm(0.025) * main.OLPR.result$cse)
round(CI.lower.OLPR, 2)  # lower bound of the 95% CI
CI.upper.OLPR <- exp(main.OLPR.result$coefficients + qnorm(0.975) * main.OLPR.result$cse)
round(CI.upper.OLPR, 2)  # upper bound of the 95% CI

# Figure A.4
# baseline bar: average vote share among observations with same.surname == 0 in each period
vote.share.low.prefecture.SNTV <- with(dataset.C.SNTV, mean(vote.share[which(same.surname == 0)]))
vote.share.low.prefecture.OLPR <- with(dataset.C.OLPR, mean(vote.share[which(same.surname == 0)]))

# draw one panel: a light bar at the baseline ("Low"), a dark bar at baseline * ratio ("High"),
# and an error bar on the dark bar running from baseline * lower to baseline * upper
draw.name.recognition.panel <- function(baseline, ratio, lower, upper, title) {
  high <- baseline * ratio
  plot(NULL, NULL, type = "n", bty = "l", xlim = c(0.5, 2.5), ylim = c(0, 1.3), 
       xlab = "Name recognition", ylab = "Vote Share (%)", main = title, 
       xaxt = "n", yaxt = "n")
  abline(h = 0, lty = 3)
  polygon(c(0.7, 1.3, 1.3, 0.7), c(0, 0, baseline, baseline), col = gray(0.8))
  polygon(c(1.7, 2.3, 2.3, 1.7), c(0, 0, high, high), col = gray(0.5))
  arrows(2, baseline * lower, 2, baseline * upper, length = 0.02, angle = 90, code = 3)
  axis(1, at = c(1, 2), labels = c("Low", "High"))
  axis(2, at = c(0, 0.5, 1, 1.5))
  invisible(NULL)
}

pdf("Figure_A4.pdf", width = 6, height = 3, paper = "special", pointsize = 7)
layout(matrix(c(1, 2), 1, 2))  # two panels side by side
par(mar = c(5, 6, 4, 2))
draw.name.recognition.panel(vote.share.low.prefecture.SNTV, high.low.ratio.SNTV, 
                            CI.lower.SNTV, CI.upper.SNTV, "SNTV (1947-1980)")
draw.name.recognition.panel(vote.share.low.prefecture.OLPR, high.low.ratio.OLPR, 
                            CI.lower.OLPR, CI.upper.OLPR, "OLPR (2001-2013)")
dev.off()